This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button, a document will be generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Load the raw listings. read_csv() is exported by readr, not dplyr, so it is
# namespace-qualified here to work even when readr itself is not attached.
df <- readr::read_csv("AirBnB.csv")
## Rows: 7833 Columns: 41
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (19): host_name, host_since_anniversary, Customer Since, neighbourhood_c...
## dbl (22): host_id, host_since_year, Age in years, id, latitude, longitude, a...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Discard host identifiers, listing identifiers, and raw location fields.
# Columns with awkward names are targeted through regular-expression
# matches() selectors instead of being quoted.
df <- dplyr::select(
  df,
  -c(
    host_id,
    host_name,
    host_since_year,
    host_since_anniversary,
    matches("Customer Since"),
    zipcode,
    id,
    city,
    state,
    country,
    latitude,
    longitude,
    matches("customers...50..review.rate"),
    number_of_reviews
  )
)
# Give the remaining columns short snake_case names (new_name = old_name).
# The *_translated / *_cleansed columns take over the plain names.
df <- rename(
  df,
  age_of_exp = `Age in years`,
  total_revenue = `Total Rev`,
  rev_per_2_guest = `Daily Rev per 2 guests, unless limited to 1`,
  min_cost_per_night = `Min Nights`,
  neighbourhood = neighbourhood_cleansed,
  city = city_translated,
  state = state_translated
)
# Convert a column to numeric, optionally deleting characters first.
#
# x:     vector to convert; it is coerced to character before parsing.
# strip: regular expression of characters to delete before parsing, or NULL
#        to parse the text unchanged.
# Returns a double vector the same length as `x`. Entries that cannot be
# parsed become NA; the coercion warning is suppressed on purpose because the
# callers below drop those rows explicitly.
parse_numeric_text <- function(x, strip = NULL) {
  text <- as.character(x)
  if (!is.null(strip)) {
    text <- gsub(strip, "", text)
  }
  suppressWarnings(as.numeric(text))
}

# Total revenue: remove "$" and "," then keep only rows that parse.
# The raw text column is replaced by total_revenue_clean, and
# log_total_revenue = log(revenue + 1) is added for modelling.
df$total_revenue_clean <- parse_numeric_text(df$total_revenue, strip = "[$,]")
df <- df[!is.na(df$total_revenue_clean), ]
df$total_revenue <- NULL
df$log_total_revenue <- log(df$total_revenue_clean + 1)

# Daily revenue per two guests: same treatment, no log column.
df$rev_per_2_guest_clean <- parse_numeric_text(
  df$rev_per_2_guest,
  strip = "[$,]"
)
df <- df[!is.na(df$rev_per_2_guest_clean), ]
df$rev_per_2_guest <- NULL

# Host response rate: parsed as-is and overwritten in place; rows whose value
# is not a plain number are dropped.
df$host_response_rate <- parse_numeric_text(df$host_response_rate)
df <- df[!is.na(df$host_response_rate), ]
# Parse the nightly price. Strip "$", whitespace AND thousands separators
# (",") before converting, matching the "[$,]" pattern used for the revenue
# columns. Without the comma, a price written like "1,200" would fail to
# parse, become NA, and its row would be dropped by the filter that follows.
df$price <- as.character(df$price)
df$price <- as.numeric(gsub("[$,[:space:]]", "", df$price))
## Warning: NAs introduced by coercion
# Keep only listings whose price is a usable number, then add the
# log(price + 1) column used as a model predictor.
df <- df %>%
  dplyr::mutate(price = suppressWarnings(as.numeric(price))) %>%
  dplyr::filter(!is.na(price)) %>%
  dplyr::mutate(log_price = log(price + 1))
# Derived predictors: log(x + 1) and square-root versions of the numeric
# columns. These are computed BEFORE the source columns are pruned below
# (log_review_scores_rating needs review_scores_rating, which is dropped).
df$log_review_scores_value <- log(df$review_scores_value + 1)
df$log_age_of_exp <- log(df$age_of_exp + 1)
df$sqrt_age_of_exp <- sqrt(df$age_of_exp)
df$log_review_scores_rating <- log(df$review_scores_rating + 1)
df$log_accommodates <- log(df$accommodates + 1)
df$sqrt_accommodates <- sqrt(df$accommodates)
df$log_host_response_rate <- log(df$host_response_rate + 1)

# Columns that are not used in the analysis.
# - minimum_nights is dropped without being cleaned first; the earlier
#   cleaning step was dead code because the column is removed here.
# - "review_scores_cleanliness" was previously misspelled
#   ("review_scores_cleaniness"), so the column silently survived and
#   na.omit() below also discarded rows that were NA only in that column.
cols_to_drop <- c(
  "neighbourhood",
  "city",
  "state",
  "bathrooms",
  "bedrooms",
  "beds",
  "bed_type",
  "guests_included",
  "extra_people",
  "minimum_nights",
  "review_scores_accuracy",
  "review_scores_rating",
  "review_scores_cleanliness",
  "review_scores_checkin",
  "review_scores_communication",
  "review_scores_location"
)

# Assigning NULL to a misspelled column is a silent no-op, which is how the
# typo above went unnoticed; report unknown names instead of ignoring them.
unknown_cols <- setdiff(cols_to_drop, names(df))
if (length(unknown_cols) > 0) {
  warning(
    "Columns to drop not found in df: ",
    paste(unknown_cols, collapse = ", "),
    call. = FALSE
  )
}
df <- df[, !(names(df) %in% cols_to_drop), drop = FALSE]

# Complete cases only, so every model below is fitted on the same rows.
df <- na.omit(df)
head(df)
## # A tibble: 6 × 21
## age_of_exp property_type room_type accommodates price min_cost_per_night
## <dbl> <chr> <chr> <dbl> <dbl> <chr>
## 1 8.93 Apartment Entire home/apt 4 130 $520
## 2 8.8 Apartment Private room 2 59 $207
## 3 8.74 Apartment Entire home/apt 4 95 $285
## 4 8.62 Apartment Entire home/apt 2 100 $220
## 5 8.57 Apartment Entire home/apt 6 250 $500
## 6 8.57 Apartment Private room 2 115 $115
## # ℹ 15 more variables: host_response_time <chr>, host_response_rate <dbl>,
## # review_scores_cleanliness <dbl>, review_scores_value <dbl>,
## # total_revenue_clean <dbl>, log_total_revenue <dbl>,
## # rev_per_2_guest_clean <dbl>, log_price <dbl>,
## # log_review_scores_value <dbl>, log_age_of_exp <dbl>, sqrt_age_of_exp <dbl>,
## # log_review_scores_rating <dbl>, log_accommodates <dbl>,
## # sqrt_accommodates <dbl>, log_host_response_rate <dbl>
# Raw total revenue, in thousands of dollars. Not normally distributed, so it
# is likely a poor response for linear regression unless it is transformed.
hist(
  df$total_revenue_clean / 1000,
  breaks = 40,
  main = "Histogram of the distribution of Total Revenue",
  xlab = "Total Revenue ($1000)"
)

# Log total revenue looks closer to normal and is better suited to linear
# regression, so log total revenue is the response investigated from here on.
# (A leftover merge-conflict marker and a duplicated copy of the two plots
# below were removed.)
hist(
  df$log_total_revenue,
  breaks = 20,
  main = "Histogram of the distribution of Log Total Revenue"
)

# The boxplot makes it easier to see that the distribution is less skewed.
boxplot(
  df$log_total_revenue,
  main = "Boxplot of the distribution of Log Total Revenue",
  ylab = "Log Total Revenue"
)
Checking Property Type
# Keep only property types with at least 10 listings so each box in the plot
# is based on a reasonable sample. The dplyr verbs are namespace-qualified
# because MASS (attached later in this document) masks select() and stats
# masks filter(); unqualified calls break when the chunk is re-run in the
# same session.
df_property_type <- df %>%
  dplyr::add_count(property_type) %>%
  dplyr::filter(n >= 10) %>%
  dplyr::select(-n)

boxplot(
  log_total_revenue ~ property_type,
  data = df_property_type,
  main = "Log Revenue by Property Type",
  ylab = "Log Total Revenue",
  las = 1,
  cex.axis = 0.7
)
# By observation, property type is not a very suitable predictor.
Check whether room_type is a good candidate predictor to add to our model.
# Distribution of log total revenue within each room type.
boxplot(
  log_total_revenue ~ room_type,
  data = df,
  main = "Log Revenue by Room Type",
  ylab = "Log Total Revenue",
  xlab = "Room Type",
  las = 1,
  cex.axis = 0.7
)
# The three room types are clearly distinguishable from one another,
# especially shared rooms versus the other two.
Check whether host_response_time is a good candidate predictor to add to our model.
# Distribution of log total revenue within each host response time category.
boxplot(
  log_total_revenue ~ host_response_time,
  data = df,
  main = "Log Revenue by host response time",
  ylab = "Log Total Revenue",
  las = 1,
  cex.axis = 0.7
)
# There is a non-trivial relationship here, but it is marginal compared to
# room_type as a categorical predictor.
From the box plots, we conclude that room type is the best categorical variable to use in our model.
##
## Call:
## lm(formula = log_total_revenue ~ log_age_of_exp + log_price +
## review_scores_value + sqrt_accommodates + host_response_rate +
## room_type, data = as.data.frame(df), y = TRUE)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.9443 -0.8645 0.0594 0.9042 4.0434
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.10429 0.29234 -0.357 0.721
## log_age_of_exp 2.30491 0.07910 29.139 < 2e-16 ***
## log_price 0.48994 0.04546 10.776 < 2e-16 ***
## review_scores_value 0.10600 0.01919 5.524 3.46e-08 ***
## sqrt_accommodates 0.31886 0.04735 6.734 1.81e-11 ***
## host_response_rate 0.94364 0.11170 8.448 < 2e-16 ***
## room_typePrivate room -0.07659 0.04814 -1.591 0.112
## room_typeShared room -0.87323 0.21942 -3.980 6.98e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.237 on 5694 degrees of freedom
## Multiple R-squared: 0.2165, Adjusted R-squared: 0.2156
## F-statistic: 224.8 on 7 and 5694 DF, p-value: < 2.2e-16
# Fitted versus actual log total revenue for the log model.
plot(
  log_model_with_cat$fitted.values,
  df$log_total_revenue,
  cex = 0.01,
  main = "Fitted Log Total Revenue vs Actual Log Total Revenue",
  xlab = "Fitted Log Total Revenue",
  ylab = "Actual Log Total Revenue"
)
# Reference line y = x: points on it are predicted exactly. abline() must be
# its own call after plot(); passing it as an unnamed argument to plot() only
# worked through lazy evaluation of the `type` parameter.
abline(0, 1, col = "red")
Residuals vs. Fitted plot: we look for residuals with zero mean and constant variance (homoscedasticity).
Attempt to optimize model with further Box-Cox transformation on top of log.
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
library(psych)
# Profile the Box-Cox log-likelihood for the fitted log model. The result
# holds the lambda grid in b$x and the matching log-likelihoods in b$y.
b <- boxcox(log_model_with_cat)

# Choose the grid value of lambda with the highest log-likelihood.
lambda <- b$x[which.max(b$y)]
lambda
## [1] 1.151515
# Geometric mean of the (already log-transformed) response. It rescales the
# Box-Cox transform so the transformed response stays on a scale comparable
# to the original one.
geom_mean <- geometric.mean(df$log_total_revenue)

# Scaled Box-Cox transform: gm^(1 - lambda) * (y^lambda - 1) / lambda.
log_total_revenue_transformed <-
  geom_mean^(1 - lambda) * (df$log_total_revenue^lambda - 1) / lambda
df$log_total_revenue_transformed <- log_total_revenue_transformed

# Refit with the same predictors as the log model, swapping in the
# Box-Cox-transformed response.
box_cox_log_model_with_cat <- lm(
  log_total_revenue_transformed ~
    log_age_of_exp +
    log_price +
    review_scores_value +
    sqrt_accommodates +
    host_response_rate +
    room_type,
  data = df,
  y = TRUE
)
summary(box_cox_log_model_with_cat)
##
## Call:
## lm(formula = log_total_revenue_transformed ~ log_age_of_exp +
## log_price + review_scores_value + sqrt_accommodates + host_response_rate +
## room_type, data = df, y = TRUE)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.8632 -0.8717 0.0444 0.8959 4.2136
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.80055 0.29219 -6.162 7.67e-10 ***
## log_age_of_exp 2.31110 0.07906 29.233 < 2e-16 ***
## log_price 0.48899 0.04544 10.761 < 2e-16 ***
## review_scores_value 0.10537 0.01918 5.494 4.10e-08 ***
## sqrt_accommodates 0.32137 0.04733 6.790 1.23e-11 ***
## host_response_rate 0.94504 0.11164 8.465 < 2e-16 ***
## room_typePrivate room -0.07089 0.04812 -1.473 0.140728
## room_typeShared room -0.85165 0.21931 -3.883 0.000104 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.237 on 5694 degrees of freedom
## Multiple R-squared: 0.2168, Adjusted R-squared: 0.2159
## F-statistic: 225.2 on 7 and 5694 DF, p-value: < 2.2e-16
# Diagnostic plot 1 (which = 1) for the Box-Cox model: minimal improvement
# observed.
plot(box_cox_log_model_with_cat, which = 1, cex = 0.01)

# Diagnostic plot 2 (which = 2): minimal improvement observed. Drawn once;
# the duplicated call and the leftover merge-conflict marker were removed.
plot(box_cox_log_model_with_cat, which = 2, cex = 0.7)
We see that the Box-Cox transformation barely improves our residuals, and it also reduces the interpretability of our model by adding complexity to the already log-transformed response.